## Importing relevant Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install seaborn
import seaborn as sns
import plotly.graph_objects as go
import warnings
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import fbeta_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import *
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
#profiling
import pandas_profiling
from pandas_profiling import ProfileReport
Requirement already satisfied: seaborn in ./opt/anaconda3/lib/python3.9/site-packages (0.11.2) Requirement already satisfied: numpy>=1.15 in ./opt/anaconda3/lib/python3.9/site-packages (from seaborn) (1.21.5) Requirement already satisfied: scipy>=1.0 in ./opt/anaconda3/lib/python3.9/site-packages (from seaborn) (1.7.3) Requirement already satisfied: matplotlib>=2.2 in ./opt/anaconda3/lib/python3.9/site-packages (from seaborn) (3.5.1) Requirement already satisfied: pandas>=0.23 in ./opt/anaconda3/lib/python3.9/site-packages (from seaborn) (1.4.2) Requirement already satisfied: fonttools>=4.22.0 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib>=2.2->seaborn) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib>=2.2->seaborn) (1.3.2) Requirement already satisfied: pyparsing>=2.2.1 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib>=2.2->seaborn) (3.0.4) Requirement already satisfied: python-dateutil>=2.7 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib>=2.2->seaborn) (2.8.2) Requirement already satisfied: cycler>=0.10 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib>=2.2->seaborn) (0.11.0) Requirement already satisfied: packaging>=20.0 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib>=2.2->seaborn) (21.3) Requirement already satisfied: pillow>=6.2.0 in ./opt/anaconda3/lib/python3.9/site-packages (from matplotlib>=2.2->seaborn) (9.0.1) Requirement already satisfied: pytz>=2020.1 in ./opt/anaconda3/lib/python3.9/site-packages (from pandas>=0.23->seaborn) (2021.3) Requirement already satisfied: six>=1.5 in ./opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib>=2.2->seaborn) (1.12.0)
!pip install --upgrade pip
Requirement already satisfied: pip in ./opt/anaconda3/lib/python3.9/site-packages (23.0.1)
#Loading datasets
# Base directory hoisted into a single constant so the location is edited in
# one place instead of being duplicated in every read_csv call.
DATA_DIR = '/Users/Admin/Desktop/Churn Capstone'
Train = pd.read_csv(f'{DATA_DIR}/Train.csv')
Test = pd.read_csv(f'{DATA_DIR}/Test.csv')
Train.head()
| user_id | REGION | TENURE | MONTANT | FREQUENCE_RECH | REVENUE | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | ON_NET | ORANGE | TIGO | ZONE1 | ZONE2 | MRG | REGULARITY | TOP_PACK | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7ee9e11e342e27c70455960acc80d3f91c1286d1 | DAKAR | K > 24 month | 20000.0 | 47.0 | 21602.0 | 7201.0 | 52.0 | 8835.0 | 3391.0 | 396.0 | 185.0 | NaN | NaN | NO | 62 | On net 200F=Unlimited _call24H | 30.0 | 0 |
| 1 | 50443f42bdc92b10388fc56e520e4421a5fa655c | NaN | K > 24 month | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NO | 3 | NaN | NaN | 0 |
| 2 | da90b5c1a9b204c186079f89969aa01cb03c91b2 | NaN | K > 24 month | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NO | 1 | NaN | NaN | 0 |
| 3 | 364ec1b424cdc64c25441a444a16930289a0051e | SAINT-LOUIS | K > 24 month | 7900.0 | 19.0 | 7896.0 | 2632.0 | 25.0 | 9385.0 | 27.0 | 46.0 | 20.0 | NaN | 2.0 | NO | 61 | Data:490F=1GB,7d | 7.0 | 0 |
| 4 | d5a5247005bc6d41d3d99f4ef312ebb5f640f2cb | DAKAR | K > 24 month | 12350.0 | 21.0 | 12351.0 | 4117.0 | 29.0 | 9360.0 | 66.0 | 102.0 | 34.0 | NaN | NaN | NO | 56 | All-net 500F=2000F;5d | 11.0 | 0 |
# Column dtypes and non-null counts for the raw training data (~1.08M rows).
Train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1077024 entries, 0 to 1077023 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 1077024 non-null object 1 REGION 652687 non-null object 2 TENURE 1077024 non-null object 3 MONTANT 699139 non-null float64 4 FREQUENCE_RECH 699139 non-null float64 5 REVENUE 714669 non-null float64 6 ARPU_SEGMENT 714669 non-null float64 7 FREQUENCE 714669 non-null float64 8 DATA_VOLUME 547261 non-null float64 9 ON_NET 683850 non-null float64 10 ORANGE 629880 non-null float64 11 TIGO 432250 non-null float64 12 ZONE1 84898 non-null float64 13 ZONE2 68794 non-null float64 14 MRG 1077024 non-null object 15 REGULARITY 1077024 non-null int64 16 TOP_PACK 626129 non-null object 17 FREQ_TOP_PACK 626129 non-null float64 18 CHURN 1077024 non-null int64 dtypes: float64(12), int64(2), object(5) memory usage: 156.1+ MB
# Absolute count of missing values per column.
Train.isnull().sum()
user_id 0 REGION 424337 TENURE 0 MONTANT 377885 FREQUENCE_RECH 377885 REVENUE 362355 ARPU_SEGMENT 362355 FREQUENCE 362355 DATA_VOLUME 529763 ON_NET 393174 ORANGE 447144 TIGO 644774 ZONE1 992126 ZONE2 1008230 MRG 0 REGULARITY 0 TOP_PACK 450895 FREQ_TOP_PACK 450895 CHURN 0 dtype: int64
# Finding the percentage of missing values per column, sorted descending.
# The mean of the boolean null-mask is the fraction missing; *100 -> percent.
# Same values as the original sum()/len() form, but one pass and clearer.
Train_missing_cols = (
    (Train.isnull().mean() * 100)
    .round(2)
    .to_frame('null')
    .sort_values('null', ascending=False)
)
Train_missing_cols
| null | |
|---|---|
| ZONE2 | 93.61 |
| ZONE1 | 92.12 |
| TIGO | 59.87 |
| DATA_VOLUME | 49.19 |
| FREQ_TOP_PACK | 41.86 |
| TOP_PACK | 41.86 |
| ORANGE | 41.52 |
| REGION | 39.40 |
| ON_NET | 36.51 |
| MONTANT | 35.09 |
| FREQUENCE_RECH | 35.09 |
| REVENUE | 33.64 |
| ARPU_SEGMENT | 33.64 |
| FREQUENCE | 33.64 |
| REGULARITY | 0.00 |
| user_id | 0.00 |
| MRG | 0.00 |
| TENURE | 0.00 |
| CHURN | 0.00 |
# Dropping sparse / low-value columns.
# NOTE(review): the original comment said "above 50%", but several dropped
# columns are well below 50% missing (ORANGE ~41%, REGION ~39%, ON_NET ~37%,
# REVENUE ~34%, MRG 0%) -- this is a judgement call, not a 50% rule.
Train.drop(columns = ['ZONE2','ZONE1','TIGO','ORANGE','ON_NET','REGION','TOP_PACK', 'MRG', 'REVENUE'], inplace = True)
Train
| user_id | TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7ee9e11e342e27c70455960acc80d3f91c1286d1 | K > 24 month | 20000.0 | 47.0 | 7201.0 | 52.0 | 8835.0 | 62 | 30.0 | 0 |
| 1 | 50443f42bdc92b10388fc56e520e4421a5fa655c | K > 24 month | NaN | NaN | NaN | NaN | NaN | 3 | NaN | 0 |
| 2 | da90b5c1a9b204c186079f89969aa01cb03c91b2 | K > 24 month | NaN | NaN | NaN | NaN | NaN | 1 | NaN | 0 |
| 3 | 364ec1b424cdc64c25441a444a16930289a0051e | K > 24 month | 7900.0 | 19.0 | 2632.0 | 25.0 | 9385.0 | 61 | 7.0 | 0 |
| 4 | d5a5247005bc6d41d3d99f4ef312ebb5f640f2cb | K > 24 month | 12350.0 | 21.0 | 4117.0 | 29.0 | 9360.0 | 56 | 11.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1077019 | 56e22fe3312a48cf860b043a021dab275383a20a | K > 24 month | NaN | NaN | NaN | NaN | NaN | 16 | NaN | 0 |
| 1077020 | 587b72930e4d36b3fb94a18c3ef65c2079460eee | K > 24 month | 2500.0 | 5.0 | 833.0 | 5.0 | 0.0 | 34 | 2.0 | 0 |
| 1077021 | d6831b8edacb7d9928b3f053fb1283574577ae42 | K > 24 month | NaN | NaN | NaN | NaN | NaN | 3 | NaN | 1 |
| 1077022 | cb1f6510b084173c0deae49095f35ac29f916701 | K > 24 month | 600.0 | 1.0 | 200.0 | 1.0 | 591.0 | 16 | 1.0 | 0 |
| 1077023 | 28f56fd0d9f8d8647bb6c62e7a3f4f35f49f4d6f | K > 24 month | 1500.0 | 4.0 | 500.0 | 5.0 | 1265.0 | 50 | 2.0 | 0 |
1077024 rows × 10 columns
# First rows of the raw test data (no CHURN column -- it is the target).
Test.head()
| user_id | REGION | TENURE | MONTANT | FREQUENCE_RECH | REVENUE | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | ON_NET | ORANGE | TIGO | ZONE1 | ZONE2 | MRG | REGULARITY | TOP_PACK | FREQ_TOP_PACK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 51fe4c3347db1f8571d18ac03f716c41acee30a4 | MATAM | I 18-21 month | 2500.0 | 5.0 | 2500.0 | 833.0 | 5.0 | 0.0 | 64.0 | 70.0 | NaN | NaN | NaN | NO | 35 | All-net 500F=2000F;5d | 5.0 |
| 1 | 5ad5d67c175bce107cc97b98c4e37dcc38aa7f3e | NaN | K > 24 month | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NO | 2 | NaN | NaN |
| 2 | 5a4db591c953a8d8f373877fad37aaf4268899a1 | NaN | K > 24 month | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NO | 22 | NaN | NaN |
| 3 | 8bf9b4d8880aeba1c9a0da48be78f12e629be37c | NaN | K > 24 month | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NO | 6 | NaN | NaN |
| 4 | c7cdf2af01e9fa95bf498b68c122aa4b9a8d10df | SAINT-LOUIS | K > 24 month | 5100.0 | 7.0 | 5637.0 | 1879.0 | 15.0 | 7783.0 | 30.0 | 24.0 | 0.0 | 0.0 | NaN | NO | 60 | Data:1000F=2GB,30d | 4.0 |
# Column dtypes and non-null counts for the raw test data (~190K rows).
Test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 190063 entries, 0 to 190062 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 190063 non-null object 1 REGION 115330 non-null object 2 TENURE 190063 non-null object 3 MONTANT 123695 non-null float64 4 FREQUENCE_RECH 123695 non-null float64 5 REVENUE 126422 non-null float64 6 ARPU_SEGMENT 126422 non-null float64 7 FREQUENCE 126422 non-null float64 8 DATA_VOLUME 96716 non-null float64 9 ON_NET 120771 non-null float64 10 ORANGE 111417 non-null float64 11 TIGO 76555 non-null float64 12 ZONE1 14850 non-null float64 13 ZONE2 12011 non-null float64 14 MRG 190063 non-null object 15 REGULARITY 190063 non-null int64 16 TOP_PACK 110773 non-null object 17 FREQ_TOP_PACK 110773 non-null float64 dtypes: float64(12), int64(1), object(5) memory usage: 26.1+ MB
# Drop the same columns that were removed from Train so both frames keep an
# identical feature set.
Test.drop(columns = ['ZONE2','ZONE1','TIGO','ORANGE','ON_NET','REGION','TOP_PACK','MRG', 'REVENUE'], inplace = True)
Test
| user_id | TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 51fe4c3347db1f8571d18ac03f716c41acee30a4 | I 18-21 month | 2500.0 | 5.0 | 833.0 | 5.0 | 0.0 | 35 | 5.0 |
| 1 | 5ad5d67c175bce107cc97b98c4e37dcc38aa7f3e | K > 24 month | NaN | NaN | NaN | NaN | NaN | 2 | NaN |
| 2 | 5a4db591c953a8d8f373877fad37aaf4268899a1 | K > 24 month | NaN | NaN | NaN | NaN | 0.0 | 22 | NaN |
| 3 | 8bf9b4d8880aeba1c9a0da48be78f12e629be37c | K > 24 month | NaN | NaN | NaN | NaN | NaN | 6 | NaN |
| 4 | c7cdf2af01e9fa95bf498b68c122aa4b9a8d10df | K > 24 month | 5100.0 | 7.0 | 1879.0 | 15.0 | 7783.0 | 60 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190058 | 1092956a3dad77ceb7e8d7c70e3e13f77b60e2aa | F 9-12 month | 500.0 | 1.0 | 169.0 | 1.0 | 0.0 | 14 | 1.0 |
| 190059 | bec10becca7faa8e9cab9981b3aee5e9a7f04828 | K > 24 month | 2000.0 | 4.0 | 667.0 | 5.0 | 0.0 | 29 | 3.0 |
| 190060 | 7ac6fc191f8732b1b146e57f9ede983626b93eae | K > 24 month | NaN | NaN | NaN | NaN | NaN | 2 | NaN |
| 190061 | d592c81971d6120b0d19f9ace85f278ea21b89a0 | K > 24 month | 300.0 | 2.0 | 99.0 | 2.0 | 0.0 | 12 | NaN |
| 190062 | 9b4f57aeef72cd52c634498d0ea27ea3fbb67bf4 | K > 24 month | NaN | NaN | NaN | NaN | NaN | 2 | NaN |
190063 rows × 9 columns
# Re-inspect Train after the column drop: 10 columns remain.
Train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1077024 entries, 0 to 1077023 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 1077024 non-null object 1 TENURE 1077024 non-null object 2 MONTANT 699139 non-null float64 3 FREQUENCE_RECH 699139 non-null float64 4 ARPU_SEGMENT 714669 non-null float64 5 FREQUENCE 714669 non-null float64 6 DATA_VOLUME 547261 non-null float64 7 REGULARITY 1077024 non-null int64 8 FREQ_TOP_PACK 626129 non-null float64 9 CHURN 1077024 non-null int64 dtypes: float64(6), int64(2), object(2) memory usage: 82.2+ MB
# Distinct recharge-frequency values (still contains nan before imputation).
Train.FREQUENCE_RECH.unique()
array([ 47., nan, 19., 21., 2., 1., 13., 3., 5., 14., 7.,
20., 15., 4., 16., 11., 8., 23., 18., 17., 12., 27.,
50., 40., 9., 6., 32., 39., 33., 38., 48., 60., 26.,
49., 10., 44., 22., 30., 28., 46., 52., 25., 45., 24.,
36., 62., 37., 58., 29., 69., 51., 31., 64., 35., 56.,
85., 73., 53., 55., 34., 57., 43., 72., 70., 65., 68.,
61., 75., 41., 42., 54., 76., 80., 71., 83., 82., 81.,
59., 74., 67., 63., 66., 93., 78., 77., 84., 79., 87.,
88., 92., 94., 110., 86., 95., 98., 89., 96., 101., 100.,
114., 91., 104., 90., 103., 97., 106., 109., 99., 117., 113.,
112., 121., 115., 108., 111., 119., 118., 102., 105., 131.])
# Distinct REGULARITY values -- integers 1..62, no missing values.
Train.REGULARITY.unique()
array([62, 3, 1, 61, 56, 6, 10, 2, 16, 43, 8, 18, 12, 24, 54, 29, 14,
40, 23, 52, 60, 34, 13, 46, 47, 31, 9, 58, 38, 36, 19, 5, 39, 55,
27, 4, 32, 26, 17, 48, 22, 53, 59, 20, 44, 7, 41, 49, 33, 42, 28,
30, 15, 57, 21, 11, 45, 50, 25, 37, 51, 35])
# Filling in missing numeric values in the train dataset with column means.
# Plain reassignment in a loop replaces the per-column
# `fillna(..., inplace=True)` calls, which are deprecated on newer pandas and
# fragile when the frame is a slice.
# NOTE(review): mean-imputing columns that are 35-50% missing flattens their
# distributions (the later describe() shows the 50%/75% quantiles collapsing
# onto the mean) -- consider median or model-based imputation.
for _col in ['MONTANT', 'FREQUENCE_RECH', 'ARPU_SEGMENT',
             'FREQUENCE', 'DATA_VOLUME', 'FREQ_TOP_PACK']:
    Train[_col] = Train[_col].fillna(Train[_col].mean())
# Checking for null values in the Train dataset -- all zero after imputation.
Train.isnull().sum()
user_id 0 TENURE 0 MONTANT 0 FREQUENCE_RECH 0 ARPU_SEGMENT 0 FREQUENCE 0 DATA_VOLUME 0 REGULARITY 0 FREQ_TOP_PACK 0 CHURN 0 dtype: int64
# Filling in missing numeric values in the test dataset with column means.
# Same loop-over-reassignment pattern as the Train imputation (replaces the
# deprecated `fillna(..., inplace=True)` chain).
# NOTE(review): imputing Test with Test's own means leaks test statistics;
# standard practice is to reuse the means computed on Train -- TODO confirm
# intent before changing, since it would alter the final predictions.
for _col in ['MONTANT', 'FREQUENCE_RECH', 'ARPU_SEGMENT',
             'FREQUENCE', 'DATA_VOLUME', 'FREQ_TOP_PACK']:
    Test[_col] = Test[_col].fillna(Test[_col].mean())
# Boxplot of ARPU_SEGMENT to eyeball outliers. Passing the series via the
# x= keyword avoids the seaborn 0.12 positional-argument FutureWarning that
# the original run emitted.
sns.set(rc = {'figure.figsize':(10,5)})
sns.boxplot(x=Train['ARPU_SEGMENT'])
/Users/Admin/opt/anaconda3/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='ARPU_SEGMENT'>
# Boxplot of REGULARITY. x= keyword silences the seaborn 0.12
# positional-argument FutureWarning visible in the original run.
sns.set(rc = {'figure.figsize':(10,5)})
sns.boxplot(x=Train['REGULARITY'])
/Users/Admin/opt/anaconda3/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='REGULARITY'>
# NOTE(review): duplicate of the earlier ARPU_SEGMENT boxplot cell -- kept
# for notebook fidelity. x= keyword silences the seaborn FutureWarning.
sns.set(rc = {'figure.figsize':(10,5)})
sns.boxplot(x=Train['ARPU_SEGMENT'])
/Users/Admin/opt/anaconda3/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='ARPU_SEGMENT'>
# Summary statistics for ARPU_SEGMENT after mean imputation.
Train[['ARPU_SEGMENT']].describe()
| ARPU_SEGMENT | |
|---|---|
| count | 1.077024e+06 |
| mean | 1.835356e+03 |
| std | 1.948397e+03 |
| min | 0.000000e+00 |
| 25% | 6.330000e+02 |
| 50% | 1.835356e+03 |
| 75% | 1.835356e+03 |
| max | 1.773920e+05 |
# 25th percentile of ARPU_SEGMENT (post-imputation).
Train['ARPU_SEGMENT'].quantile(0.25)
633.0
# 75th percentile -- equals the imputation mean because >25% of rows were
# filled with it.
Train['ARPU_SEGMENT'].quantile(0.75)
1835.3559605915466
# Interquartile range of ARPU_SEGMENT for Tukey outlier fencing.
# Both quartiles come from a single quantile() call.
Q1, Q3 = Train['ARPU_SEGMENT'].quantile([0.25, 0.75])
IQR = Q3 - Q1
IQR
1202.3559605915466
# Tukey fences: points beyond 1.5 * IQR from the quartiles count as outliers.
lower_lim = Q1 - 1.5* IQR
upper_lim = Q3 + 1.5* IQR
lower_lim
-1170.53394088732
# Upper Tukey fence value.
upper_lim
3638.8899014788667
# Boolean masks flagging values outside the Tukey fences, then the count of
# rows that would remain after removing the flagged ones.
outliers_15_low = (Train['ARPU_SEGMENT']< lower_lim)
outliers_15_up = (Train['ARPU_SEGMENT']> upper_lim)
len(Train['ARPU_SEGMENT']) - (len(Train['ARPU_SEGMENT'][outliers_15_low] ) + len(Train['ARPU_SEGMENT'][outliers_15_up] ))
970967
# The ~106K ARPU_SEGMENT values flagged as outliers.
Train['ARPU_SEGMENT'][(outliers_15_low |outliers_15_up)]
0 7201.0
4 4117.0
18 24093.0
35 4333.0
47 4690.0
...
1076979 4366.0
1076989 4364.0
1077011 8830.0
1077013 4233.0
1077016 5396.0
Name: ARPU_SEGMENT, Length: 106057, dtype: float64
# The complementary ~971K in-range ARPU_SEGMENT values.
Train['ARPU_SEGMENT'][~(outliers_15_low |outliers_15_up)]
1 1835.355961
2 1835.355961
3 2632.000000
5 667.000000
6 1000.000000
...
1077019 1835.355961
1077020 833.000000
1077021 1835.355961
1077022 200.000000
1077023 500.000000
Name: ARPU_SEGMENT, Length: 970967, dtype: float64
# Full Train frame before outlier removal.
Train
| user_id | TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7ee9e11e342e27c70455960acc80d3f91c1286d1 | K > 24 month | 20000.000000 | 47.000000 | 7201.000000 | 52.000000 | 8835.000000 | 62 | 30.000000 | 0 |
| 1 | 50443f42bdc92b10388fc56e520e4421a5fa655c | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 3 | 9.262446 | 0 |
| 2 | da90b5c1a9b204c186079f89969aa01cb03c91b2 | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 1 | 9.262446 | 0 |
| 3 | 364ec1b424cdc64c25441a444a16930289a0051e | K > 24 month | 7900.000000 | 19.000000 | 2632.000000 | 25.000000 | 9385.000000 | 61 | 7.000000 | 0 |
| 4 | d5a5247005bc6d41d3d99f4ef312ebb5f640f2cb | K > 24 month | 12350.000000 | 21.000000 | 4117.000000 | 29.000000 | 9360.000000 | 56 | 11.000000 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1077019 | 56e22fe3312a48cf860b043a021dab275383a20a | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 16 | 9.262446 | 0 |
| 1077020 | 587b72930e4d36b3fb94a18c3ef65c2079460eee | K > 24 month | 2500.000000 | 5.000000 | 833.000000 | 5.000000 | 0.000000 | 34 | 2.000000 | 0 |
| 1077021 | d6831b8edacb7d9928b3f053fb1283574577ae42 | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 3 | 9.262446 | 1 |
| 1077022 | cb1f6510b084173c0deae49095f35ac29f916701 | K > 24 month | 600.000000 | 1.000000 | 200.000000 | 1.000000 | 591.000000 | 16 | 1.000000 | 0 |
| 1077023 | 28f56fd0d9f8d8647bb6c62e7a3f4f35f49f4d6f | K > 24 month | 1500.000000 | 4.000000 | 500.000000 | 5.000000 | 1265.000000 | 50 | 2.000000 | 0 |
1077024 rows × 10 columns
# Keep only rows inside the Tukey fences. .copy() materialises the filtered
# frame so the later in-place drop/encode steps operate on an owned object --
# the original run raised SettingWithCopyWarning there because this slice
# was only a view.
Train = Train[~(outliers_15_low | outliers_15_up)].copy()
Train
| user_id | TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 50443f42bdc92b10388fc56e520e4421a5fa655c | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 3 | 9.262446 | 0 |
| 2 | da90b5c1a9b204c186079f89969aa01cb03c91b2 | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 1 | 9.262446 | 0 |
| 3 | 364ec1b424cdc64c25441a444a16930289a0051e | K > 24 month | 7900.000000 | 19.000000 | 2632.000000 | 25.000000 | 9385.000000 | 61 | 7.000000 | 0 |
| 5 | b91a81e590d0dd029d99002d4226b9ca2cdf3cbc | E 6-9 month | 2000.000000 | 2.000000 | 667.000000 | 3.000000 | 4206.000000 | 62 | 1.000000 | 0 |
| 6 | 42335a2a6a2a4cd30452ce9bc611bd66b3a9d4c2 | K > 24 month | 3000.000000 | 1.000000 | 1000.000000 | 1.000000 | 3368.801722 | 6 | 1.000000 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1077019 | 56e22fe3312a48cf860b043a021dab275383a20a | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 16 | 9.262446 | 0 |
| 1077020 | 587b72930e4d36b3fb94a18c3ef65c2079460eee | K > 24 month | 2500.000000 | 5.000000 | 833.000000 | 5.000000 | 0.000000 | 34 | 2.000000 | 0 |
| 1077021 | d6831b8edacb7d9928b3f053fb1283574577ae42 | K > 24 month | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 3 | 9.262446 | 1 |
| 1077022 | cb1f6510b084173c0deae49095f35ac29f916701 | K > 24 month | 600.000000 | 1.000000 | 200.000000 | 1.000000 | 591.000000 | 16 | 1.000000 | 0 |
| 1077023 | 28f56fd0d9f8d8647bb6c62e7a3f4f35f49f4d6f | K > 24 month | 1500.000000 | 4.000000 | 500.000000 | 5.000000 | 1265.000000 | 50 | 2.000000 | 0 |
970967 rows × 10 columns
# Re-plot ARPU_SEGMENT after outlier removal to confirm the fences worked.
sns.boxplot(x = Train['ARPU_SEGMENT'])
<AxesSubplot:xlabel='ARPU_SEGMENT'>
#Checking if predictor class is balanced.
# Counts show ~79% not-churned vs ~21% churned -- imbalanced, motivating the
# SMOTE step later.
print(Train['CHURN'].value_counts())
plt.figure(figsize=(10,5))
Train['CHURN'].value_counts(normalize=True).plot(kind='bar')
plt.ylabel('counts')
plt.xlabel('Churn')
0 770163 1 200804 Name: CHURN, dtype: int64
Text(0.5, 0, 'Churn')
# Checking the correlation of numeric columns with each other and with CHURN.
# NOTE(review): pandas >= 2.0 would require numeric_only=True here because
# object columns (user_id, TENURE) are still present at this point.
Train.corr()
| MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|
| MONTANT | 1.000000 | 0.730147 | 0.846969 | 0.664758 | 0.169163 | 0.091168 | 0.526761 | 0.145531 |
| FREQUENCE_RECH | 0.730147 | 1.000000 | 0.713643 | 0.884401 | 0.107192 | 0.179278 | 0.758561 | 0.078075 |
| ARPU_SEGMENT | 0.846969 | 0.713643 | 1.000000 | 0.766184 | 0.185431 | 0.116494 | 0.551149 | 0.143347 |
| FREQUENCE | 0.664758 | 0.884401 | 0.766184 | 1.000000 | 0.123771 | 0.230550 | 0.723220 | 0.051828 |
| DATA_VOLUME | 0.169163 | 0.107192 | 0.185431 | 0.123771 | 1.000000 | 0.057755 | 0.085620 | 0.026293 |
| REGULARITY | 0.091168 | 0.179278 | 0.116494 | 0.230550 | 0.057755 | 1.000000 | 0.039928 | -0.469006 |
| FREQ_TOP_PACK | 0.526761 | 0.758561 | 0.551149 | 0.723220 | 0.085620 | 0.039928 | 1.000000 | 0.105240 |
| CHURN | 0.145531 | 0.078075 | 0.143347 | 0.051828 | 0.026293 | -0.469006 | 0.105240 | 1.000000 |
# Heat map of correlation. REGULARITY is the only feature with a notable
# (negative) correlation to CHURN in the table above.
sns.set(rc = {'figure.figsize':(9,9)})
sns.heatmap(Train.corr(), annot =True)
<AxesSubplot:>
#Train['CHURN']= Train['CHURN'].replace(0,'Not Churned')
#Train['CHURN']= Train['CHURN'].replace(1,'Churned')
#Train
# plotting the counts of unique values of the Tenure column.
# The 'K > 24 month' bucket dominates (~95% of rows).
print(Train['TENURE'].value_counts())
plt.figure(figsize=(10,5))
Train['TENURE'].value_counts(normalize=True).plot(kind='bar')
plt.ylabel('counts')
plt.xlabel('TENURE')
K > 24 month 919907 I 18-21 month 20779 H 15-18 month 12033 G 12-15 month 7039 J 21-24 month 5764 F 9-12 month 4235 E 6-9 month 829 D 3-6 month 381 Name: TENURE, dtype: int64
Text(0.5, 0, 'TENURE')
# Countplot of TENURE split by CHURN.
sns.set(rc = {'figure.figsize': (10,6)})
sns.countplot(data = Train, x= 'TENURE', hue = 'CHURN', palette = 'inferno_r')
<AxesSubplot:xlabel='TENURE', ylabel='count'>
# Visualizing the relationship with Tenure and churn.
# NOTE(review): same plot as the previous cell, only the palette differs.
sns.set(rc = {'figure.figsize': (10,6)})
sns.countplot(data = Train, x= 'TENURE', hue = 'CHURN', palette = 'twilight')
<AxesSubplot:xlabel='TENURE', ylabel='count'>
# ARPU_SEGMENT distribution split by churn status.
sns.histplot(data=Train, x="ARPU_SEGMENT", hue = 'CHURN', bins = 10)
<AxesSubplot:xlabel='ARPU_SEGMENT', ylabel='Count'>
# REGULARITY distribution split by churn status.
sns.histplot(data=Train, x="REGULARITY", hue = 'CHURN', bins = 10)
<AxesSubplot:xlabel='REGULARITY', ylabel='Count'>
#sns.set(rc = {'figure.figsize': (20,10)})
#sns.displot(Train, x="REGULARITY", hue = 'CHURN' , bins=60)
# checking for unique values per column of the dataset.
Train.nunique()
user_id 970967 TENURE 8 MONTANT 2336 FREQUENCE_RECH 84 ARPU_SEGMENT 3640 FREQUENCE 91 DATA_VOLUME 21150 REGULARITY 62 FREQ_TOP_PACK 98 CHURN 2 dtype: int64
# Stating the categorical (object-dtype) columns: user_id and TENURE.
cat_cols = Train.select_dtypes(include=["object"]).columns.to_list()
cat_cols
['user_id', 'TENURE']
# Dropping the ID column -- a unique identifier carries no predictive signal.
# Rebinding instead of inplace=True on a slice avoids the
# SettingWithCopyWarning the original run emitted here.
Train = Train.drop(columns=['user_id'])
/var/folders/xr/z3nw6rbs7k5g_8qmkl696c4w0000gp/T/ipykernel_73051/694804483.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Train.drop(columns = ['user_id'], inplace = True)
# Applying label encoding to the Tenure column.
# NOTE(review): LabelEncoder assigns codes in alphabetical order, which here
# happens to match tenure order ('D 3-6 month' .. 'K > 24 month'), so the
# codes are ordinal only by accident -- an explicit ordered mapping would be
# safer; confirm before relying on ordinality.
le = LabelEncoder()
Train['TENURE'] = le.fit_transform(Train['TENURE'])
/var/folders/xr/z3nw6rbs7k5g_8qmkl696c4w0000gp/T/ipykernel_73051/3162494724.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Train['TENURE'] = le.fit_transform(Train['TENURE'])
# Train after encoding: 9 all-numeric columns.
Train
| TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 7 | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 3 | 9.262446 | 0 |
| 2 | 7 | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 1 | 9.262446 | 0 |
| 3 | 7 | 7900.000000 | 19.000000 | 2632.000000 | 25.000000 | 9385.000000 | 61 | 7.000000 | 0 |
| 5 | 1 | 2000.000000 | 2.000000 | 667.000000 | 3.000000 | 4206.000000 | 62 | 1.000000 | 0 |
| 6 | 7 | 3000.000000 | 1.000000 | 1000.000000 | 1.000000 | 3368.801722 | 6 | 1.000000 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1077019 | 7 | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 16 | 9.262446 | 0 |
| 1077020 | 7 | 2500.000000 | 5.000000 | 833.000000 | 5.000000 | 0.000000 | 34 | 2.000000 | 0 |
| 1077021 | 7 | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 3 | 9.262446 | 1 |
| 1077022 | 7 | 600.000000 | 1.000000 | 200.000000 | 1.000000 | 591.000000 | 16 | 1.000000 | 0 |
| 1077023 | 7 | 1500.000000 | 4.000000 | 500.000000 | 5.000000 | 1265.000000 | 50 | 2.000000 | 0 |
970967 rows × 9 columns
# Test frame before its own encoding step.
Test
| user_id | TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 51fe4c3347db1f8571d18ac03f716c41acee30a4 | I 18-21 month | 2500.000000 | 5.00000 | 833.000000 | 5.00000 | 0.000000 | 35 | 5.000000 |
| 1 | 5ad5d67c175bce107cc97b98c4e37dcc38aa7f3e | K > 24 month | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 2 | 5a4db591c953a8d8f373877fad37aaf4268899a1 | K > 24 month | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 0.000000 | 22 | 9.234326 |
| 3 | 8bf9b4d8880aeba1c9a0da48be78f12e629be37c | K > 24 month | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 6 | 9.234326 |
| 4 | c7cdf2af01e9fa95bf498b68c122aa4b9a8d10df | K > 24 month | 5100.000000 | 7.00000 | 1879.000000 | 15.00000 | 7783.000000 | 60 | 4.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190058 | 1092956a3dad77ceb7e8d7c70e3e13f77b60e2aa | F 9-12 month | 500.000000 | 1.00000 | 169.000000 | 1.00000 | 0.000000 | 14 | 1.000000 |
| 190059 | bec10becca7faa8e9cab9981b3aee5e9a7f04828 | K > 24 month | 2000.000000 | 4.00000 | 667.000000 | 5.00000 | 0.000000 | 29 | 3.000000 |
| 190060 | 7ac6fc191f8732b1b146e57f9ede983626b93eae | K > 24 month | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 190061 | d592c81971d6120b0d19f9ace85f278ea21b89a0 | K > 24 month | 300.000000 | 2.00000 | 99.000000 | 2.00000 | 0.000000 | 12 | 9.234326 |
| 190062 | 9b4f57aeef72cd52c634498d0ea27ea3fbb67bf4 | K > 24 month | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
190063 rows × 9 columns
# Applying label encoding to the Tenure column of Test.
# NOTE(review): a fresh encoder is fit on Test; its codes only match Train's
# if both splits contain the identical set of TENURE categories -- verify,
# or reuse the encoder fitted on Train.
le = LabelEncoder()
Test['TENURE'] = le.fit_transform(Test['TENURE'])
Test
| user_id | TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 51fe4c3347db1f8571d18ac03f716c41acee30a4 | 5 | 2500.000000 | 5.00000 | 833.000000 | 5.00000 | 0.000000 | 35 | 5.000000 |
| 1 | 5ad5d67c175bce107cc97b98c4e37dcc38aa7f3e | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 2 | 5a4db591c953a8d8f373877fad37aaf4268899a1 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 0.000000 | 22 | 9.234326 |
| 3 | 8bf9b4d8880aeba1c9a0da48be78f12e629be37c | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 6 | 9.234326 |
| 4 | c7cdf2af01e9fa95bf498b68c122aa4b9a8d10df | 7 | 5100.000000 | 7.00000 | 1879.000000 | 15.00000 | 7783.000000 | 60 | 4.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190058 | 1092956a3dad77ceb7e8d7c70e3e13f77b60e2aa | 2 | 500.000000 | 1.00000 | 169.000000 | 1.00000 | 0.000000 | 14 | 1.000000 |
| 190059 | bec10becca7faa8e9cab9981b3aee5e9a7f04828 | 7 | 2000.000000 | 4.00000 | 667.000000 | 5.00000 | 0.000000 | 29 | 3.000000 |
| 190060 | 7ac6fc191f8732b1b146e57f9ede983626b93eae | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 190061 | d592c81971d6120b0d19f9ace85f278ea21b89a0 | 7 | 300.000000 | 2.00000 | 99.000000 | 2.00000 | 0.000000 | 12 | 9.234326 |
| 190062 | 9b4f57aeef72cd52c634498d0ea27ea3fbb67bf4 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
190063 rows × 9 columns
# Dropping the ID column from Test, mirroring the Train preprocessing.
Test.drop(columns =['user_id'], inplace = True)
Test
| TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | |
|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 2500.000000 | 5.00000 | 833.000000 | 5.00000 | 0.000000 | 35 | 5.000000 |
| 1 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 2 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 0.000000 | 22 | 9.234326 |
| 3 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 6 | 9.234326 |
| 4 | 7 | 5100.000000 | 7.00000 | 1879.000000 | 15.00000 | 7783.000000 | 60 | 4.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190058 | 2 | 500.000000 | 1.00000 | 169.000000 | 1.00000 | 0.000000 | 14 | 1.000000 |
| 190059 | 7 | 2000.000000 | 4.00000 | 667.000000 | 5.00000 | 0.000000 | 29 | 3.000000 |
| 190060 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 190061 | 7 | 300.000000 | 2.00000 | 99.000000 | 2.00000 | 0.000000 | 12 | 9.234326 |
| 190062 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
190063 rows × 8 columns
# Splitting data
# Defining the target & predictor variables
X = Train.drop(columns = ["CHURN"])
y = Train["CHURN"]
# Splitting the dataframe into train and test; stratify=y preserves the
# ~79/21 churn ratio in both splits, random_state pins reproducibility.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.25, random_state = 1, stratify = y)
# Re-attach the target to the training features for modelling.
# X_train and y_train share the same index, so a plain index-aligned join is
# enough; the original's `on=X_train.index` passed the index as explicit join
# keys, which is redundant and harder to read.
train_data = X_train.join(y_train)
train_data.head()
| TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|
| 850510 | 7 | 9100.000000 | 12.000000 | 2223.000000 | 16.000000 | 3368.801722 | 60 | 5.000000 | 0 |
| 638215 | 5 | 600.000000 | 2.000000 | 200.000000 | 2.000000 | 229.000000 | 23 | 1.000000 | 0 |
| 635165 | 7 | 1000.000000 | 2.000000 | 333.000000 | 2.000000 | 1.000000 | 61 | 2.000000 | 0 |
| 194282 | 7 | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 1 | 9.262446 | 1 |
| 631158 | 7 | 1600.000000 | 4.000000 | 534.000000 | 6.000000 | 1216.000000 | 22 | 1.000000 | 0 |
# 728,225 rows x 9 columns (75% of the filtered data).
train_data.shape
(728225, 9)
# Re-attach the target to the held-out features for evaluation.
# Index-aligned join replaces the redundant `on=X_test.index` form.
test_data = X_test.join(y_test)
test_data.head()
| TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|
| 733032 | 7 | 3000.000000 | 4.000000 | 739.000000 | 7.000000 | 0.000000 | 22 | 1.000000 | 0 |
| 404607 | 7 | 9500.000000 | 15.000000 | 3144.000000 | 15.000000 | 34383.000000 | 60 | 8.000000 | 0 |
| 723217 | 7 | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 1 | 9.262446 | 1 |
| 858083 | 7 | 3000.000000 | 4.000000 | 933.000000 | 6.000000 | 2374.000000 | 41 | 2.000000 | 0 |
| 82115 | 7 | 5529.210895 | 11.523756 | 1835.355961 | 13.974439 | 3368.801722 | 1 | 9.262446 | 1 |
# 242,742 rows x 9 columns (the 25% hold-out).
test_data.shape
(242742, 9)
# Counting churned vs not-churned customers in the modelling split.
# Index by class label rather than unpacking value_counts() positionally --
# positional unpacking depends on frequency ordering and would silently swap
# the two numbers if churners ever became the majority.
_counts = train_data['CHURN'].value_counts()
count_not_churned, count_churned = _counts.loc[0], _counts.loc[1]
count_not_churned, count_churned
(577622, 150603)
# Minority class size (churned).
count_churned
150603
# Majority class size (not churned).
count_not_churned
577622
# Re-derive features/target from the modelling split ahead of rebalancing.
# Note this rebinds the earlier X and y to the 75% training split only.
X = train_data.drop('CHURN',axis='columns')
y = train_data['CHURN']
# Using SMOTE to synthesize minority-class (churned) samples up to parity
# with the majority class. Applied to the training split only, which is the
# correct side of the train/test boundary.
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()
0 577622 1 577622 Name: CHURN, dtype: int64
# Further splitting of the balanced data into its own train/validation split,
# again stratified and seeded for reproducibility.
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.2,random_state=1, stratify=y_sm)
# Scale the numeric features with min-max scaling.
# Fit the scaler on the training data ONLY, then reuse that fitted scaler to
# transform the test data: re-fitting a fresh scaler on the test set (as the
# previous code did) leaks test-set statistics and puts the two splits on
# inconsistent scales.
num_cols = ['MONTANT', 'FREQUENCE_RECH', 'ARPU_SEGMENT', 'FREQUENCE',
            'DATA_VOLUME', 'REGULARITY', 'FREQ_TOP_PACK']
scaler = MinMaxScaler()
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])
test_data
| TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | CHURN | |
|---|---|---|---|---|---|---|---|---|---|
| 733032 | 7 | 0.027553 | 0.036585 | 0.203134 | 0.068966 | 0.000000 | 0.344262 | 0.000000 | 0 |
| 404607 | 7 | 0.087752 | 0.170732 | 0.864211 | 0.160920 | 0.037587 | 0.967213 | 0.071429 | 0 |
| 723217 | 7 | 0.050977 | 0.128338 | 0.504496 | 0.149131 | 0.003683 | 0.000000 | 0.084311 | 1 |
| 858083 | 7 | 0.027553 | 0.036585 | 0.256460 | 0.057471 | 0.002595 | 0.655738 | 0.010204 | 0 |
| 82115 | 7 | 0.050977 | 0.128338 | 0.504496 | 0.149131 | 0.003683 | 0.000000 | 0.084311 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 716575 | 7 | 0.009956 | 0.024390 | 0.100880 | 0.011494 | 0.003683 | 0.081967 | 0.000000 | 0 |
| 125423 | 7 | 0.070618 | 0.292683 | 0.723749 | 0.310345 | 0.007160 | 0.721311 | 0.214286 | 0 |
| 920509 | 7 | 0.000695 | 0.000000 | 0.009071 | 0.000000 | 0.000001 | 0.016393 | 0.000000 | 1 |
| 796938 | 7 | 0.050977 | 0.128338 | 0.504496 | 0.149131 | 0.003683 | 0.000000 | 0.084311 | 1 |
| 116239 | 4 | 0.103496 | 0.231707 | 0.943650 | 0.206897 | 0.000851 | 0.672131 | 0.112245 | 0 |
242742 rows × 9 columns
# Preview the competition's hold-out submission features ("Test" set)
Test
| TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | |
|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 2500.000000 | 5.00000 | 833.000000 | 5.00000 | 0.000000 | 35 | 5.000000 |
| 1 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 2 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 0.000000 | 22 | 9.234326 |
| 3 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 6 | 9.234326 |
| 4 | 7 | 5100.000000 | 7.00000 | 1879.000000 | 15.00000 | 7783.000000 | 60 | 4.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190058 | 2 | 500.000000 | 1.00000 | 169.000000 | 1.00000 | 0.000000 | 14 | 1.000000 |
| 190059 | 7 | 2000.000000 | 4.00000 | 667.000000 | 5.00000 | 0.000000 | 29 | 3.000000 |
| 190060 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
| 190061 | 7 | 300.000000 | 2.00000 | 99.000000 | 2.00000 | 0.000000 | 12 | 9.234326 |
| 190062 | 7 | 5506.547759 | 11.48394 | 1828.750281 | 13.93967 | 3358.421151 | 2 | 9.234326 |
190063 rows × 8 columns
# Apply the already-fitted scaler to the submission features.
# transform() (not fit_transform) keeps the scaling consistent with what the
# model saw during training instead of re-fitting on unseen data.
Test[num_cols] = scaler.transform(Test[num_cols])
Test
| TENURE | MONTANT | FREQUENCE_RECH | ARPU_SEGMENT | FREQUENCE | DATA_VOLUME | REGULARITY | FREQ_TOP_PACK | |
|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 0.012284 | 0.033613 | 0.014093 | 0.044444 | 0.000000 | 0.557377 | 0.010283 |
| 1 | 7 | 0.027207 | 0.088100 | 0.030940 | 0.143774 | 0.004449 | 0.016393 | 0.021168 |
| 2 | 7 | 0.027207 | 0.088100 | 0.030940 | 0.143774 | 0.000000 | 0.344262 | 0.021168 |
| 3 | 7 | 0.027207 | 0.088100 | 0.030940 | 0.143774 | 0.004449 | 0.081967 | 0.021168 |
| 4 | 7 | 0.025189 | 0.050420 | 0.031790 | 0.155556 | 0.010311 | 0.967213 | 0.007712 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 190058 | 2 | 0.002358 | 0.000000 | 0.002859 | 0.000000 | 0.000000 | 0.213115 | 0.000000 |
| 190059 | 7 | 0.009803 | 0.025210 | 0.011285 | 0.044444 | 0.000000 | 0.459016 | 0.005141 |
| 190060 | 7 | 0.027207 | 0.088100 | 0.030940 | 0.143774 | 0.004449 | 0.016393 | 0.021168 |
| 190061 | 7 | 0.001365 | 0.008403 | 0.001675 | 0.011111 | 0.000000 | 0.180328 | 0.021168 |
| 190062 | 7 | 0.027207 | 0.088100 | 0.030940 | 0.143774 | 0.004449 | 0.016393 | 0.021168 |
190063 rows × 8 columns
# Logistic Regression baseline
warnings.filterwarnings('ignore')  # set BEFORE fitting so warnings raised during fit are actually suppressed
log_reg = LogisticRegression()
log_reg_model = log_reg.fit(X_train_sm, y_train_sm)
## Feature importance (coefficients) of the Logistic Regression model
log_reg_importance = log_reg_model.coef_[0]
log_reg_importance = pd.DataFrame(log_reg_importance, index = X.columns)
log_reg_importance.reset_index(inplace = True)
log_reg_importance.rename(columns = {"index": "Feature", 0:"Score"}, inplace = True)
log_reg_importance.sort_values(by = "Score", ascending = False, inplace = True)
log_reg_importance
| Feature | Score | |
|---|---|---|
| 0 | TENURE | 0.045332 |
| 7 | FREQ_TOP_PACK | 0.020871 |
| 2 | FREQUENCE_RECH | 0.010269 |
| 3 | ARPU_SEGMENT | 0.000360 |
| 1 | MONTANT | 0.000095 |
| 5 | DATA_VOLUME | 0.000039 |
| 4 | FREQUENCE | -0.007249 |
| 6 | REGULARITY | -0.110106 |
y_pred = log_reg.predict(X_test_sm)
y_true = y_test_sm
# Confusion matrix for the logistic regression model
cm_lr = confusion_matrix(y_true, y_pred)
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm_lr, annot=True, linewidth=0.5, fmt=".0f", cmap='RdPu', ax=ax)
# Call xlabel/ylabel as functions; `plt.xlabel = ('y_pred')` assigned a string
# over the pyplot function instead of labelling the axis.
plt.xlabel('y_pred')
plt.ylabel('y_true')
plt.show()
# Evaluating the model: per-class precision/recall/F1 on the balanced test split
smote_report = classification_report(y_true, y_pred, target_names=["Stayed", "Churned"])
print(smote_report)
precision recall f1-score support
Stayed 0.86 0.75 0.80 115524
Churned 0.78 0.87 0.82 115525
accuracy 0.81 231049
macro avg 0.82 0.81 0.81 231049
weighted avg 0.82 0.81 0.81 231049
# F-beta score with beta=0.9 (slightly precision-weighted).
# NOTE(review): the original label called this the "F2 Score", but F2 uses beta=2.
log_reg_score = fbeta_score(y_true, y_pred, average='binary', beta= 0.9)
# Calculate and show the AUC and ROC
# NOTE(review): roc_curve is fed hard 0/1 predictions rather than
# predict_proba scores, so the "curve" has a single interior point — confirm intent.
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
plt.plot(fpr, tpr)
#plt.xlabel("False Positive Rate")
#plt.ylabel("True Positive Rate")
plt.show()
print(f"{log_reg} AUC score: {roc_auc_score(y_true, y_pred)}")
LogisticRegression() AUC score: 0.8130004440086855
# Store the logistic regression AUC (from hard 0/1 predictions) for comparison
log_reg_auc = roc_auc_score(y_true, y_pred)
log_reg_auc
0.8130004440086855
# Random forest with default hyperparameters and a fixed seed for reproducibility
rfc = RandomForestClassifier(random_state= 24)
rfc = rfc.fit(X_train_sm, y_train_sm)
# Feature importances of the fitted forest, one row per feature, highest first
rf_importance = pd.DataFrame({"score": rfc.feature_importances_,
                              "Feature": list(X.columns)})
rf_importance.sort_values(by = "score", ascending = False, ignore_index = True, inplace = True)
## Visualizing the feature importances as a bar chart
fig = px.bar(rf_importance, x = "Feature", y = "score")
fig.show()
rfc_pred = rfc.predict(X_test_sm)
rfc_true = y_test_sm
# Confusion matrix for the random forest classifier
cm_rfc = confusion_matrix(rfc_true, rfc_pred)
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm_rfc, annot=True, linewidth=0.5, fmt=".0f", cmap='RdPu', ax=ax)
# Call the functions; `plt.xlabel = (...)` clobbered pyplot's xlabel with a string.
plt.xlabel('rfc_pred')
plt.ylabel('rfc_true')
plt.show()
# Evaluating the model: per-class precision/recall/F1
smote_report = classification_report(rfc_true, rfc_pred, target_names=["Stayed", "Churned"])
print(smote_report)
precision recall f1-score support
Stayed 0.89 0.80 0.84 115524
Churned 0.82 0.91 0.86 115525
accuracy 0.85 231049
macro avg 0.86 0.85 0.85 231049
weighted avg 0.86 0.85 0.85 231049
# F-beta score (beta=0.9, slightly precision-weighted) for the random forest
rfc_score = fbeta_score(rfc_true, rfc_pred, average='binary', beta= 0.9)
# Calculate and show the AUC and ROC
# NOTE(review): built from hard 0/1 predictions, not probabilities,
# so the ROC has a single interior point — confirm this is intended.
fpr, tpr, thresholds = roc_curve(rfc_true, rfc_pred)
plt.plot(fpr, tpr)
#plt.xlabel("False Positive Rate")
#plt.ylabel("True Positive Rate")
plt.show()
print(f"{rfc} AUC score: {roc_auc_score(rfc_true, rfc_pred)}")
RandomForestClassifier(random_state=24) AUC score: 0.852836180464006
# Store the random forest AUC for the model-comparison table
rfc_auc = roc_auc_score(rfc_true, rfc_pred)
rfc_auc
0.852836180464006
# Decision tree classifier with a fixed seed for reproducibility
dt = DecisionTreeClassifier(random_state=100)
dt = dt.fit(X_train_sm, y_train_sm)
dt_pred = dt.predict(X_test_sm)
dt_true = y_test_sm
# Confusion matrix for the decision tree classifier
cm_dt = confusion_matrix(dt_true, dt_pred)
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm_dt, annot=True, linewidth=0.5, fmt=".0f", cmap='RdPu', ax=ax)
# Call the functions; `plt.xlabel = (...)` clobbered pyplot's xlabel with a string.
plt.xlabel('dt_pred')
plt.ylabel('dt_true')
plt.show()
# Evaluating the model: per-class precision/recall/F1
smote_report = classification_report(dt_true, dt_pred, target_names=["Stayed", "Churned"])
print(smote_report)
precision recall f1-score support
Stayed 0.89 0.77 0.83 115524
Churned 0.80 0.90 0.85 115525
accuracy 0.84 231049
macro avg 0.84 0.84 0.84 231049
weighted avg 0.84 0.84 0.84 231049
# F-beta score (beta=0.9, slightly precision-weighted) for the decision tree
dt_score = fbeta_score(dt_true, dt_pred, average='binary', beta= 0.9)
# Calculate and show the AUC and ROC
# NOTE(review): built from hard 0/1 predictions, not probabilities,
# so the ROC has a single interior point — confirm this is intended.
fpr, tpr, thresholds = roc_curve(dt_true, dt_pred)
plt.plot(fpr, tpr)
#plt.xlabel("False Positive Rate")
#plt.ylabel("True Positive Rate")
plt.show()
print(f"{dt} AUC score: {roc_auc_score(dt_true, dt_pred)}")
DecisionTreeClassifier(random_state=100) AUC score: 0.8373026305639508
# Store the decision tree AUC for the model-comparison table
dt_auc = roc_auc_score(dt_true, dt_pred)
dt_auc
0.8373026305639508
# K-nearest-neighbours classifier (k = 15)
knn = KNeighborsClassifier(n_neighbors = 15)
knn = knn.fit(X_train_sm, y_train_sm)
knn_pred = knn.predict(X_test_sm)
knn_true = y_test_sm
# Confusion matrix for KNN
cm_knn = confusion_matrix(knn_true, knn_pred)
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm_knn, annot=True, linewidth=0.5, fmt=".0f", cmap='RdPu', ax=ax)
# Call the functions; `plt.xlabel = (...)` clobbered pyplot's xlabel with a string.
plt.xlabel('knn_pred')
plt.ylabel('knn_true')
plt.show()
# Evaluating the model: per-class precision/recall/F1
smote_report = classification_report(knn_true, knn_pred, target_names=["Stayed", "Churned"])
print(smote_report)
precision recall f1-score support
Stayed 0.86 0.77 0.81 115524
Churned 0.79 0.87 0.83 115525
accuracy 0.82 231049
macro avg 0.82 0.82 0.82 231049
weighted avg 0.82 0.82 0.82 231049
# F-beta score (beta=0.9, slightly precision-weighted) for KNN
knn_score = fbeta_score(knn_true, knn_pred, average='binary', beta= 0.9)
# Calculating the AUC and ROC
# NOTE(review): uses hard predictions, not probabilities — single-point ROC.
fpr, tpr, thresholds = roc_curve(knn_true, knn_pred)
plt.plot(fpr, tpr)
plt.show()
print(f"{knn} AUC score: {roc_auc_score(knn_true, knn_pred)}")
KNeighborsClassifier(n_neighbors=15) AUC score: 0.8206698597497671
# Store the KNN AUC for the model-comparison table
knn_auc = roc_auc_score(knn_true, knn_pred)
knn_auc
0.8206698597497671
# Tabulate every model's AUC so the best candidate can be picked for tuning
model_scores = pd.DataFrame({
    "Models": [str(log_reg), str(rfc), str(knn), str(dt)],
    "AUC score": [log_reg_auc, rfc_auc, knn_auc, dt_auc],
})
model_scores
| Models | AUC score | |
|---|---|---|
| 0 | LogisticRegression() | 0.813000 |
| 1 | RandomForestClassifier(random_state=24) | 0.852836 |
| 2 | KNeighborsClassifier(n_neighbors=15) | 0.820670 |
| 3 | DecisionTreeClassifier(random_state=100) | 0.837303 |
# Random forest with tuned hyperparameters.
# min_samples_split=2 and min_samples_leaf=1 are sklearn's defaults, kept
# explicit here for clarity.
rand = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    criterion="gini",
    bootstrap=True,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=1,
    verbose=True,
)
# Fit the tuned model on the balanced training data
randmodel = rand.fit(X_train_sm, y_train_sm)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 11.0s [Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 59.8s [Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 2.5min [Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 2.8min finished
# Predict on the held-out balanced test split with the tuned forest
randpred = randmodel.predict(X_test_sm)
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers. [Parallel(n_jobs=8)]: Done 34 tasks | elapsed: 0.4s [Parallel(n_jobs=8)]: Done 184 tasks | elapsed: 2.0s [Parallel(n_jobs=8)]: Done 434 tasks | elapsed: 4.4s [Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed: 5.1s finished
# Overall accuracy of the tuned random forest
print("Accuracy")
accuracy_score(y_test_sm, randpred)
Accuracy
0.8425615345662608
# Recall for the positive (CHURN = 1) class
print("Recall")
recall_score(y_test_sm, randpred)
Recall
0.8886301666305995
# F1 score for the positive (CHURN = 1) class
print("F1 Score")
f1_score(y_test_sm, randpred)
F1 Score
0.849495643251384
# Evaluating the tuned model: per-class precision/recall/F1
best_grid_rf_report = classification_report(y_test_sm, randpred, target_names=["Stayed", "Churned"])
print(best_grid_rf_report)
precision recall f1-score support
Stayed 0.88 0.80 0.83 115524
Churned 0.81 0.89 0.85 115525
accuracy 0.84 231049
macro avg 0.85 0.84 0.84 231049
weighted avg 0.85 0.84 0.84 231049
# Calculate and show the AUC and ROC
# NOTE(review): uses hard predictions, not probabilities — single-point ROC.
fpr, tpr, thresholds = roc_curve(y_test_sm, randpred)
plt.plot(fpr, tpr)
plt.show()
print(f"{randmodel} AUC score: {roc_auc_score(y_test_sm, randpred)}")
RandomForestClassifier(max_depth=15, n_estimators=500, n_jobs=-1,
random_state=1, verbose=True) AUC score: 0.8425613351763849
# Predict churn labels for the competition submission set
subpred = randmodel.predict(Test)
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers. [Parallel(n_jobs=8)]: Done 34 tasks | elapsed: 0.1s [Parallel(n_jobs=8)]: Done 184 tasks | elapsed: 1.2s [Parallel(n_jobs=8)]: Done 434 tasks | elapsed: 2.3s [Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed: 2.6s finished
# Peek at the predicted labels
subpred
array([0, 0, 0, ..., 0, 0, 0])
# Load the sample submission template
# NOTE(review): machine-specific absolute path — will break on any other machine.
submission = pd.read_csv('/Users/Admin/Desktop/Churn Capstone/SampleSubmission.csv')
submission.head()
| user_id | CHURN | |
|---|---|---|
| 0 | 51fe4c3347db1f8571d18ac03f716c41acee30a4 | 0 |
| 1 | 5ad5d67c175bce107cc97b98c4e37dcc38aa7f3e | 0 |
| 2 | 5a4db591c953a8d8f373877fad37aaf4268899a1 | 0 |
| 3 | 8bf9b4d8880aeba1c9a0da48be78f12e629be37c | 0 |
| 4 | c7cdf2af01e9fa95bf498b68c122aa4b9a8d10df | 0 |
# Overwrite the placeholder CHURN column with the model's predictions
# (assumes Test rows are in the same order as the sample submission — TODO confirm)
submission["CHURN"] = subpred
submission.head()
| user_id | CHURN | |
|---|---|---|
| 0 | 51fe4c3347db1f8571d18ac03f716c41acee30a4 | 0 |
| 1 | 5ad5d67c175bce107cc97b98c4e37dcc38aa7f3e | 0 |
| 2 | 5a4db591c953a8d8f373877fad37aaf4268899a1 | 0 |
| 3 | 8bf9b4d8880aeba1c9a0da48be78f12e629be37c | 0 |
| 4 | c7cdf2af01e9fa95bf498b68c122aa4b9a8d10df | 0 |
# Write the submission file without the index column
submission.to_csv('Sample_submission.csv', index=False)
# Build a requirements.txt from the modules imported in this session:
# every global exposing __version__, with its module name normalised to the
# PyPI spelling. Snapshot globals() with list() first — assigning the loop
# variables below would otherwise mutate the dict while it is being iterated.
requirement_lines = []
for module in list(globals().values()):
    version = getattr(module, '__version__', None)
    if version:
        dist_name = module.__name__.replace("_", "-").replace("sklearn", "scikit-learn")
        requirement_lines.append(f'{dist_name}=={version}')
requirements = '\n'.join(requirement_lines)
with open('requirements.txt', 'w') as f:
    f.write(requirements)
print(requirements)
pandas==1.4.2 numpy==1.21.5 seaborn==0.11.2 pandas-profiling==3.4.0
# Bundle the fitted preprocessing objects and model for the deployment app.
# NOTE(review): `le` is presumably the LabelEncoder fitted earlier in the
# notebook — confirm it is the encoder the app expects.
to_export = {
    "encoder": le,
    "scaler": scaler,
    "model": randmodel,
    "pipeline": None,
}
import pickle
# pickle.dump returns None, so the old `Loaded_object = pickle.dump(...)`
# binding was a misleading no-op; just dump inside the context manager.
with open('ML_items', 'wb') as file:
    pickle.dump(to_export, file)
git clone https://huggingface.co/spaces/SampsonChris/Customer_churn_gradio
Input In [129] git clone https://huggingface.co/spaces/SampsonChris/Customer_churn_gradio ^ SyntaxError: invalid syntax